In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Object Creation

Creating a Series


In [2]:
# np.nan forces the Series to a float dtype (integers cannot hold NaN),
# hence dtype: float64 in the output below.
s = pd.Series([1,3,5,np.nan,6,8])
s


Out[2]:
0     1
1     3
2     5
3   NaN
4     6
5     8
dtype: float64

Creating a DataFrame by passing a numpy array


In [5]:
# Build a 6x4 frame of standard-normal draws, indexed by six
# consecutive days starting 2013-01-01.
idx = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=idx, columns=['A', 'B', 'C', 'D'])
df


Out[5]:
A B C D
2013-01-01 -0.614202 0.042476 -1.060533 0.281544
2013-01-02 1.602192 2.241403 -0.330993 -0.584948
2013-01-03 -2.402252 0.058440 0.595512 -1.895342
2013-01-04 1.489735 -2.288780 -0.274203 1.848972
2013-01-05 0.308056 0.135790 -1.773053 0.417936
2013-01-06 1.494492 0.233300 0.784898 0.475344

Creating a DataFrame by passing a dict of objects


In [8]:
# Columns of heterogeneous types: scalars ('A', 'B', 'F') broadcast to
# every row, the Series in 'C' supplies the index (0..3), and the
# array/Categorical values must match that length (4).
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})
df2


Out[8]:
A B C D E F
0 1 2013-01-02 1 3 test foo
1 1 2013-01-02 1 3 train foo
2 1 2013-01-02 1 3 test foo
3 1 2013-01-02 1 3 train foo

Create a panel


In [25]:
# 3-D container: 2 items x 5 dates x 4 columns.
# NOTE(review): pd.Panel was deprecated in pandas 0.20 and removed in 0.25;
# this cell only runs on the old pandas this notebook was written against.
wp = pd.Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'],
              major_axis=pd.date_range('1/1/2000', periods=5),
              minor_axis=['A', 'B', 'C', 'D'])
wp


Out[25]:
<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 5 (major_axis) x 4 (minor_axis)
Items axis: Item1 to Item2
Major_axis axis: 2000-01-01 00:00:00 to 2000-01-05 00:00:00
Minor_axis axis: A to D

Examining the dataframes


In [9]:
# dtypes is per-column. print() with a single argument behaves identically
# in Python 2 and 3, so this form is portable (the bare Py2 statement is not).
print(df.dtypes)
df2.dtypes


A    float64
B    float64
C    float64
D    float64
dtype: object
Out[9]:
A           float64
B    datetime64[ns]
C           float32
D             int32
E            object
F            object
dtype: object

In [10]:
# NOTE(review): missing parentheses — this displays the bound method object,
# not the per-column variances; likely meant df2.var().
df2.var


Out[10]:
<bound method DataFrame.var of    A          B  C  D      E    F
0  1 2013-01-02  1  3   test  foo
1  1 2013-01-02  1  3  train  foo
2  1 2013-01-02  1  3   test  foo
3  1 2013-01-02  1  3  train  foo>

In [11]:
df.T


Out[11]:
2013-01-01 00:00:00 2013-01-02 00:00:00 2013-01-03 00:00:00 2013-01-04 00:00:00 2013-01-05 00:00:00 2013-01-06 00:00:00
A -0.614202 1.602192 -2.402252 1.489735 0.308056 1.494492
B 0.042476 2.241403 0.058440 -2.288780 0.135790 0.233300
C -1.060533 -0.330993 0.595512 -0.274203 -1.773053 0.784898
D 0.281544 -0.584948 -1.895342 1.848972 0.417936 0.475344

In [12]:
# NOTE(review): missing parentheses — displays the bound method (whose repr
# happens to show the frame), not the result of df.all().
df.all


Out[12]:
<bound method DataFrame.all of                    A         B         C         D
2013-01-01 -0.614202  0.042476 -1.060533  0.281544
2013-01-02  1.602192  2.241403 -0.330993 -0.584948
2013-01-03 -2.402252  0.058440  0.595512 -1.895342
2013-01-04  1.489735 -2.288780 -0.274203  1.848972
2013-01-05  0.308056  0.135790 -1.773053  0.417936
2013-01-06  1.494492  0.233300  0.784898  0.475344>

In [13]:
df.columns


Out[13]:
Index([u'A', u'B', u'C', u'D'], dtype='object')

In [27]:
df.index


Out[27]:
<class 'pandas.tseries.index.DatetimeIndex'>
[2013-01-01, ..., 2013-01-06]
Length: 6, Freq: D, Timezone: None

In [15]:
df[:2]


Out[15]:
A B C D
2013-01-01 -0.614202 0.042476 -1.060533 0.281544
2013-01-02 1.602192 2.241403 -0.330993 -0.584948

In [16]:
df[2:]


Out[16]:
A B C D
2013-01-03 -2.402252 0.05844 0.595512 -1.895342
2013-01-04 1.489735 -2.28878 -0.274203 1.848972
2013-01-05 0.308056 0.13579 -1.773053 0.417936
2013-01-06 1.494492 0.23330 0.784898 0.475344

In [26]:
# .values exposes the underlying numpy data of each structure.
# print() with a single argument behaves identically in Python 2 and 3.
print(s.values)
print(df.values)
wp.values


[  1.   3.   5.  nan   6.   8.]
[[-0.61420229  0.0424759  -1.06053291  0.28154368]
 [ 1.60219247  2.24140315 -0.33099339 -0.58494842]
 [-2.40225195  0.05843957  0.59551244 -1.89534179]
 [ 1.48973468 -2.28877975 -0.27420302  1.84897183]
 [ 0.30805595  0.13578978 -1.77305265  0.4179356 ]
 [ 1.49449215  0.23329978  0.78489767  0.47534374]]
Out[26]:
array([[[-0.55170998, -1.49479508, -1.59716869,  0.17833934],
        [-0.98767192, -0.26073382, -0.4474023 , -0.65722338],
        [ 0.03686406, -0.32351129, -0.32887511, -0.06545153],
        [ 1.16813252, -2.60819957, -1.27858635, -0.44541161],
        [-0.94140124, -0.77789381, -0.4541703 , -0.74971538]],

       [[-1.27040084, -1.17470843, -0.03466504,  1.15361768],
        [-1.51888802,  1.08107708,  0.93455097, -1.24181088],
        [ 1.47834859,  1.00578686, -0.25632817,  1.66887394],
        [ 0.49095915,  3.24024887, -0.50037795, -0.37857297],
        [ 1.45170457, -1.62691606,  0.43392764,  0.22069279]]])

In [28]:
df.describe()


Out[28]:
a b c d
count 6.000000 6.000000 6.000000 6.000000
mean 0.313004 0.070438 -0.343062 0.090584
std 1.592166 1.436017 0.971364 1.247705
min -2.402252 -2.288780 -1.773053 -1.895342
25% -0.383638 0.046467 -0.878148 -0.368325
50% 0.898895 0.097115 -0.302598 0.349740
75% 1.493303 0.208922 0.378084 0.460992
max 1.602192 2.241403 0.784898 1.848972

In [29]:
s.describe()


Out[29]:
count    5.000000
mean     4.600000
std      2.701851
min      1.000000
25%      3.000000
50%      5.000000
75%      6.000000
max      8.000000
dtype: float64

Selection

Getting columns


In [42]:
# Attribute access (df2.E) and item access (df2['D']) both select a column.
# print() with a single argument behaves identically in Python 2 and 3.
print(df2.E)
print(df2['D'])
# NOTE(review): lowercase labels — this assumes the later column
# lower-casing cell (In [21]) has already run; out-of-order execution.
df[['a','c']]


0     test
1    train
2     test
3    train
Name: E, dtype: object
0    3
1    3
2    3
3    3
Name: D, dtype: int32
Out[42]:
a c
2013-01-01 -0.614202 -1.060533
2013-01-02 1.602192 -0.330993
2013-01-03 -2.402252 0.595512
2013-01-04 1.489735 -0.274203
2013-01-05 0.308056 -1.773053
2013-01-06 1.494492 0.784898

Getting/Filtering rows


In [44]:
df[0:3]


Out[44]:
a b c d
2013-01-01 -0.614202 0.042476 -1.060533 0.281544
2013-01-02 1.602192 2.241403 -0.330993 -0.584948
2013-01-03 -2.402252 0.058440 0.595512 -1.895342

In [45]:
df['20130102':'20130104']


Out[45]:
a b c d
2013-01-02 1.602192 2.241403 -0.330993 -0.584948
2013-01-03 -2.402252 0.058440 0.595512 -1.895342
2013-01-04 1.489735 -2.288780 -0.274203 1.848972

In [48]:
# Recreate the demo frame used by the label-based selection examples;
# keep `dates` around for .loc/.at lookups below.
dates = pd.date_range(start='20130101', periods=6)
values = np.random.randn(6, 4)
df = pd.DataFrame(values, index=dates, columns=['A', 'B', 'C', 'D'])
df


Out[48]:
A B C D
2013-01-01 1.435477 0.420006 0.912153 0.293857
2013-01-02 0.533736 0.195464 1.232967 1.012638
2013-01-03 0.029526 -0.953831 -1.020860 -1.524504
2013-01-04 -0.375673 -0.305868 0.283911 0.403089
2013-01-05 0.347365 1.098320 -0.401894 -0.892187
2013-01-06 -0.764138 0.759652 -0.407648 0.892738

In [70]:
# print() with a single argument behaves identically in Python 2 and 3.
print(df.loc[dates[0]])    # single label -> Series (one row)
print(df.loc[dates[0:2]])  # slice of labels -> DataFrame
print(df.loc[dates[-1]])


A    1.435477
B    0.420006
C    0.912153
D    0.293857
Name: 2013-01-01 00:00:00, dtype: float64
                   A         B         C         D
2013-01-01  1.435477  0.420006  0.912153  0.293857
2013-01-02  0.533736  0.195464  1.232967  1.012638
A   -0.764138
B    0.759652
C   -0.407648
D    0.892738
Name: 2013-01-06 00:00:00, dtype: float64

In [68]:
df.iloc[-1]


Out[68]:
A   -0.764138
B    0.759652
C   -0.407648
D    0.892738
Name: 2013-01-06 00:00:00, dtype: float64

In [84]:
df


Out[84]:
A B C D
2013-01-01 1.435477 0.420006 0.912153 0.293857
2013-01-02 0.533736 0.195464 1.232967 1.012638
2013-01-03 0.029526 -0.953831 -1.020860 -1.524504
2013-01-04 -0.375673 -0.305868 0.283911 0.403089
2013-01-05 0.347365 1.098320 -0.401894 -0.892187
2013-01-06 -0.764138 0.759652 -0.407648 0.892738

In [82]:
df[df.A > 0]


Out[82]:
A B C D
2013-01-01 1.435477 0.420006 0.912153 0.293857
2013-01-02 0.533736 0.195464 1.232967 1.012638
2013-01-03 0.029526 -0.953831 -1.020860 -1.524504
2013-01-05 0.347365 1.098320 -0.401894 -0.892187

In [83]:
df[df > 0]


Out[83]:
A B C D
2013-01-01 1.435477 0.420006 0.912153 0.293857
2013-01-02 0.533736 0.195464 1.232967 1.012638
2013-01-03 0.029526 NaN NaN NaN
2013-01-04 NaN NaN 0.283911 0.403089
2013-01-05 0.347365 1.098320 NaN NaN
2013-01-06 NaN 0.759652 NaN 0.892738

Using isin()


In [89]:
# Work on a copy so df is untouched; tag each row with a label
# to demonstrate isin() below.
df2 = df.copy()
df2['E'] = 'one one two three four three'.split()
df2


Out[89]:
A B C D E
2013-01-01 1.435477 0.420006 0.912153 0.293857 one
2013-01-02 0.533736 0.195464 1.232967 1.012638 one
2013-01-03 0.029526 -0.953831 -1.020860 -1.524504 two
2013-01-04 -0.375673 -0.305868 0.283911 0.403089 three
2013-01-05 0.347365 1.098320 -0.401894 -0.892187 four
2013-01-06 -0.764138 0.759652 -0.407648 0.892738 three

In [90]:
df2[df2['E'].isin(['two','four'])]


Out[90]:
A B C D E
2013-01-03 0.029526 -0.953831 -1.020860 -1.524504 two
2013-01-05 0.347365 1.098320 -0.401894 -0.892187 four

Getting Rows and Cols


In [62]:
df.loc[:,['A','B']]


Out[62]:
A B
2013-01-01 1.435477 0.420006
2013-01-02 0.533736 0.195464
2013-01-03 0.029526 -0.953831
2013-01-04 -0.375673 -0.305868
2013-01-05 0.347365 1.098320
2013-01-06 -0.764138 0.759652

In [63]:
df.loc['20130102':'20130104',['A','B']]


Out[63]:
A B
2013-01-02 0.533736 0.195464
2013-01-03 0.029526 -0.953831
2013-01-04 -0.375673 -0.305868

In [64]:
df.loc['20130102',['A','B']]


Out[64]:
A    0.533736
B    0.195464
Name: 2013-01-02 00:00:00, dtype: float64

In [71]:
df.iloc[3:5,0:2]


Out[71]:
A B
2013-01-04 -0.375673 -0.305868
2013-01-05 0.347365 1.098320

In [73]:
df.iloc[[3,5],[0,2]]


Out[73]:
A C
2013-01-04 -0.375673 0.283911
2013-01-06 -0.764138 -0.407648

In [75]:
df


Out[75]:
A B C D
2013-01-01 1.435477 0.420006 0.912153 0.293857
2013-01-02 0.533736 0.195464 1.232967 1.012638
2013-01-03 0.029526 -0.953831 -1.020860 -1.524504
2013-01-04 -0.375673 -0.305868 0.283911 0.403089
2013-01-05 0.347365 1.098320 -0.401894 -0.892187
2013-01-06 -0.764138 0.759652 -0.407648 0.892738

In [76]:
df.iloc[1:3,:]


Out[76]:
A B C D
2013-01-02 0.533736 0.195464 1.232967 1.012638
2013-01-03 0.029526 -0.953831 -1.020860 -1.524504

In [77]:
df.iloc[:,1:3]


Out[77]:
B C
2013-01-01 0.420006 0.912153
2013-01-02 0.195464 1.232967
2013-01-03 -0.953831 -1.020860
2013-01-04 -0.305868 0.283911
2013-01-05 1.098320 -0.401894
2013-01-06 0.759652 -0.407648

Getting Scalars


In [80]:
# Four equivalent ways to read one scalar.
# print() with a single argument behaves identically in Python 2 and 3.
print(df.loc[dates[0],'A'])
# Fast scalar access by label:
print(df.at[dates[0],'A'])
print(df.iloc[0,0])
# Fast scalar access by position:
df.iat[0,0]


1.43547661814
1.43547661814
1.43547661814
Out[80]:
1.4354766181386678

Modification

Assigning to attributes


In [21]:
# Lower-case every column label in place (attribute assignment mutates df).
df.columns = [name.lower() for name in df.columns]
df


Out[21]:
a b c d
2013-01-01 -0.614202 0.042476 -1.060533 0.281544
2013-01-02 1.602192 2.241403 -0.330993 -0.584948
2013-01-03 -2.402252 0.058440 0.595512 -1.895342
2013-01-04 1.489735 -2.288780 -0.274203 1.848972
2013-01-05 0.308056 0.135790 -1.773053 0.417936
2013-01-06 1.494492 0.233300 0.784898 0.475344

Sorting

By Axis


In [32]:
df.sort_index(axis=1, ascending=False)


Out[32]:
d c b a
2013-01-01 0.281544 -1.060533 0.042476 -0.614202
2013-01-02 -0.584948 -0.330993 2.241403 1.602192
2013-01-03 -1.895342 0.595512 0.058440 -2.402252
2013-01-04 1.848972 -0.274203 -2.288780 1.489735
2013-01-05 0.417936 -1.773053 0.135790 0.308056
2013-01-06 0.475344 0.784898 0.233300 1.494492

By value


In [36]:
# NOTE(review): DataFrame.sort was deprecated in pandas 0.17 and removed in
# 0.20; the modern equivalent is df.sort_values(by='b').
df.sort(columns='b')


Out[36]:
a b c d
2013-01-04 1.489735 -2.288780 -0.274203 1.848972
2013-01-01 -0.614202 0.042476 -1.060533 0.281544
2013-01-03 -2.402252 0.058440 0.595512 -1.895342
2013-01-05 0.308056 0.135790 -1.773053 0.417936
2013-01-06 1.494492 0.233300 0.784898 0.475344
2013-01-02 1.602192 2.241403 -0.330993 -0.584948

Setting


In [91]:
# Index runs 2013-01-02..07 — deliberately offset one day from `dates`,
# so assigning this into df below aligns on the index and leaves a NaN
# on 2013-01-01 (and drops the 01-07 value).
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
s1


Out[91]:
2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [92]:
df['F'] = s1
df


Out[92]:
A B C D F
2013-01-01 1.435477 0.420006 0.912153 0.293857 NaN
2013-01-02 0.533736 0.195464 1.232967 1.012638 1
2013-01-03 0.029526 -0.953831 -1.020860 -1.524504 2
2013-01-04 -0.375673 -0.305868 0.283911 0.403089 3
2013-01-05 0.347365 1.098320 -0.401894 -0.892187 4
2013-01-06 -0.764138 0.759652 -0.407648 0.892738 5

In [94]:
df.at[dates[0],'A'] = 0
df


Out[94]:
A B C D F
2013-01-01 0.000000 0.420006 0.912153 0.293857 NaN
2013-01-02 0.533736 0.195464 1.232967 1.012638 1
2013-01-03 0.029526 -0.953831 -1.020860 -1.524504 2
2013-01-04 -0.375673 -0.305868 0.283911 0.403089 3
2013-01-05 0.347365 1.098320 -0.401894 -0.892187 4
2013-01-06 -0.764138 0.759652 -0.407648 0.892738 5

In [95]:
df.iat[0,1] = 0
df


Out[95]:
A B C D F
2013-01-01 0.000000 0.000000 0.912153 0.293857 NaN
2013-01-02 0.533736 0.195464 1.232967 1.012638 1
2013-01-03 0.029526 -0.953831 -1.020860 -1.524504 2
2013-01-04 -0.375673 -0.305868 0.283911 0.403089 3
2013-01-05 0.347365 1.098320 -0.401894 -0.892187 4
2013-01-06 -0.764138 0.759652 -0.407648 0.892738 5

In [97]:
df.loc[:,'D'] = np.array([5] * len(df))
df


Out[97]:
A B C D F
2013-01-01 0.000000 0.000000 0.912153 5 NaN
2013-01-02 0.533736 0.195464 1.232967 5 1
2013-01-03 0.029526 -0.953831 -1.020860 5 2
2013-01-04 -0.375673 -0.305868 0.283911 5 3
2013-01-05 0.347365 1.098320 -0.401894 5 4
2013-01-06 -0.764138 0.759652 -0.407648 5 5

In [98]:
df2 = df.copy()
df2[df2 > 0] = -df2
df2


Out[98]:
A B C D F
2013-01-01 0.000000 0.000000 -0.912153 -5 NaN
2013-01-02 -0.533736 -0.195464 -1.232967 -5 -1
2013-01-03 -0.029526 -0.953831 -1.020860 -5 -2
2013-01-04 -0.375673 -0.305868 -0.283911 -5 -3
2013-01-05 -0.347365 -1.098320 -0.401894 -5 -4
2013-01-06 -0.764138 -0.759652 -0.407648 -5 -5

Missing Data


In [108]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1


Out[108]:
A B C D F E
2013-01-01 0.000000 0.000000 0.912153 5 NaN NaN
2013-01-02 0.533736 0.195464 1.232967 5 1 NaN
2013-01-03 0.029526 -0.953831 -1.020860 5 2 NaN
2013-01-04 -0.375673 -0.305868 0.283911 5 3 NaN

In [109]:
df1.loc[dates[0]:dates[1],'E'] = 1
df1


Out[109]:
A B C D F E
2013-01-01 0.000000 0.000000 0.912153 5 NaN 1
2013-01-02 0.533736 0.195464 1.232967 5 1 1
2013-01-03 0.029526 -0.953831 -1.020860 5 2 NaN
2013-01-04 -0.375673 -0.305868 0.283911 5 3 NaN

Drop any row with missing data


In [110]:
df1.dropna(how='any')


Out[110]:
A B C D F E
2013-01-02 0.533736 0.195464 1.232967 5 1 1

Fill missing values


In [112]:
df1.fillna(value=5)


Out[112]:
A B C D F E
2013-01-01 0.000000 0.000000 0.912153 5 5 1
2013-01-02 0.533736 0.195464 1.232967 5 1 1
2013-01-03 0.029526 -0.953831 -1.020860 5 2 5
2013-01-04 -0.375673 -0.305868 0.283911 5 3 5

Get the boolean mask where values are nan


In [113]:
pd.isnull(df1)


Out[113]:
A B C D F E
2013-01-01 False False False False True False
2013-01-02 False False False False False False
2013-01-03 False False False False False True
2013-01-04 False False False False False True

Get stats


In [117]:
df


Out[117]:
A B C D F
2013-01-01 0.000000 0.000000 0.912153 5 NaN
2013-01-02 0.533736 0.195464 1.232967 5 1
2013-01-03 0.029526 -0.953831 -1.020860 5 2
2013-01-04 -0.375673 -0.305868 0.283911 5 3
2013-01-05 0.347365 1.098320 -0.401894 5 4
2013-01-06 -0.764138 0.759652 -0.407648 5 5

In [139]:
df.describe(percentiles=[.05, .25, .75, .95])


Out[139]:
A B C D F
count 6.000000 6.000000 6.000000 6 5.000000
mean -0.038197 0.132290 0.099772 5 3.000000
std 0.473855 0.737554 0.865212 0 1.581139
min -0.764138 -0.953831 -1.020860 5 1.000000
5% -0.667022 -0.791840 -0.867557 5 1.200000
25% -0.281755 -0.229401 -0.406210 5 2.000000
50% 0.014763 0.097732 -0.058992 5 3.000000
75% 0.267905 0.618605 0.755093 5 4.000000
95% 0.487143 1.013653 1.152764 5 4.800000
max 0.533736 1.098320 1.232967 5 5.000000

In [114]:
df.mean()


Out[114]:
A   -0.038197
B    0.132290
C    0.099772
D    5.000000
F    3.000000
dtype: float64

In [116]:
df.mean(1)


Out[116]:
2013-01-01    1.478038
2013-01-02    1.592433
2013-01-03    1.010967
2013-01-04    1.520474
2013-01-05    2.008758
2013-01-06    1.917573
Freq: D, dtype: float64

Binary operation

Sub example


In [118]:
s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)
s


Out[118]:
2013-01-01   NaN
2013-01-02   NaN
2013-01-03     1
2013-01-04     3
2013-01-05     5
2013-01-06   NaN
Freq: D, dtype: float64

In [120]:
# print() with a single argument behaves identically in Python 2 and 3.
print(df)
# Subtract s row-wise: align on the index (axis='index'); rows where s
# is NaN come out all-NaN.
df.sub(s, axis='index')


                   A         B         C  D   F
2013-01-01  0.000000  0.000000  0.912153  5 NaN
2013-01-02  0.533736  0.195464  1.232967  5   1
2013-01-03  0.029526 -0.953831 -1.020860  5   2
2013-01-04 -0.375673 -0.305868  0.283911  5   3
2013-01-05  0.347365  1.098320 -0.401894  5   4
2013-01-06 -0.764138  0.759652 -0.407648  5   5
Out[120]:
A B C D F
2013-01-01 NaN NaN NaN NaN NaN
2013-01-02 NaN NaN NaN NaN NaN
2013-01-03 -0.970474 -1.953831 -2.020860 4 1
2013-01-04 -3.375673 -3.305868 -2.716089 2 0
2013-01-05 -4.652635 -3.901680 -5.401894 0 -1
2013-01-06 NaN NaN NaN NaN NaN

Applying functions to the data

Example using numpy.cumsum

How numpy.cumsum works


In [125]:
# 2x3 integer array used to illustrate cumsum along different axes.
a = np.arange(1, 7).reshape(2, 3)
a


Out[125]:
array([[1, 2, 3],
       [4, 5, 6]])

In [126]:
np.cumsum(a)


Out[126]:
array([ 1,  3,  6, 10, 15, 21])

In [127]:
np.cumsum(a,axis=0)


Out[127]:
array([[1, 2, 3],
       [5, 7, 9]])

In [128]:
np.cumsum(a,axis=1)


Out[128]:
array([[ 1,  3,  6],
       [ 4,  9, 15]])

Applying numpy.cumsum to a dataframe


In [130]:
# print() with a single argument behaves identically in Python 2 and 3.
print(df)
# Running sum down each column; the NaN in F stays NaN but later values
# keep accumulating (pandas cumsum skips NaN), per the output below.
df.apply(np.cumsum)


                   A         B         C  D   F
2013-01-01  0.000000  0.000000  0.912153  5 NaN
2013-01-02  0.533736  0.195464  1.232967  5   1
2013-01-03  0.029526 -0.953831 -1.020860  5   2
2013-01-04 -0.375673 -0.305868  0.283911  5   3
2013-01-05  0.347365  1.098320 -0.401894  5   4
2013-01-06 -0.764138  0.759652 -0.407648  5   5
Out[130]:
A B C D F
2013-01-01 0.000000 0.000000 0.912153 5 NaN
2013-01-02 0.533736 0.195464 2.145121 10 1
2013-01-03 0.563262 -0.758367 1.124261 15 3
2013-01-04 0.187589 -1.064235 1.408172 20 6
2013-01-05 0.534954 0.034086 1.006278 25 10
2013-01-06 -0.229185 0.793738 0.598629 30 15

Applying a lambda function to a dataframe


In [136]:
df.apply(lambda x: x.max())


Out[136]:
A    0.533736
B    1.098320
C    1.232967
D    5.000000
F    5.000000
dtype: float64

In [138]:
df.apply(lambda x: x.min())


Out[138]:
A   -0.764138
B   -0.953831
C   -1.020860
D    5.000000
F    1.000000
dtype: float64

In [135]:
df.apply(lambda x: x.max() - x.min())


Out[135]:
A    1.297874
B    2.052151
C    2.253827
D    0.000000
F    4.000000
dtype: float64

Get value counts (histogram)


In [154]:
# BUG FIX: the original called numpy.random.seed, but numpy is only
# imported under the alias `np` (cell In [1]), so this raised NameError.
np.random.seed(3141)
# Ten random integers drawn uniformly from 0..6.
s = pd.Series(np.random.randint(0, 7, size=10))
s


Out[154]:
0    0
1    1
2    2
3    6
4    3
5    1
6    5
7    1
8    5
9    5
dtype: int64

In [155]:
s.value_counts()


Out[155]:
5    3
1    3
6    1
3    1
2    1
0    1
dtype: int64

In [158]:
# print() with a single argument behaves identically in Python 2 and 3.
print(df)
# Per-column histograms: the result's rows are the observed values,
# with NaN where a value does not occur in that column.
df.apply(lambda x: x.value_counts())


                   A         B         C  D   F
2013-01-01  0.000000  0.000000  0.912153  5 NaN
2013-01-02  0.533736  0.195464  1.232967  5   1
2013-01-03  0.029526 -0.953831 -1.020860  5   2
2013-01-04 -0.375673 -0.305868  0.283911  5   3
2013-01-05  0.347365  1.098320 -0.401894  5   4
2013-01-06 -0.764138  0.759652 -0.407648  5   5
Out[158]:
A B C D F
-1.020860 NaN NaN 1 NaN NaN
-0.953831 NaN 1 NaN NaN NaN
-0.764138 1 NaN NaN NaN NaN
-0.407648 NaN NaN 1 NaN NaN
-0.401894 NaN NaN 1 NaN NaN
-0.375673 1 NaN NaN NaN NaN
-0.305868 NaN 1 NaN NaN NaN
0.000000 1 1 NaN NaN NaN
0.029526 1 NaN NaN NaN NaN
0.195464 NaN 1 NaN NaN NaN
0.283911 NaN NaN 1 NaN NaN
0.347365 1 NaN NaN NaN NaN
0.533736 1 NaN NaN NaN NaN
0.759652 NaN 1 NaN NaN NaN
0.912153 NaN NaN 1 NaN NaN
1.000000 NaN NaN NaN NaN 1
1.098320 NaN 1 NaN NaN NaN
1.232967 NaN NaN 1 NaN NaN
2.000000 NaN NaN NaN NaN 1
3.000000 NaN NaN NaN NaN 1
4.000000 NaN NaN NaN NaN 1
5.000000 NaN NaN NaN 6 1

Strings


In [160]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s


Out[160]:
0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object

In [165]:
s.str.lower()


Out[165]:
0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

In [167]:
# Same construction as cell In [8] above; recreated here (df2 was
# overwritten in the meantime) to demonstrate .str on a Categorical column.
df2 = pd.DataFrame({ 'A' : 1.,
                    'B' : pd.Timestamp('20130102'),
                    'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                    'D' : np.array([3] * 4,dtype='int32'),
                    'E' : pd.Categorical(["test","train","test","train"]),
                    'F' : 'foo' })
df2


Out[167]:
A B C D E F
0 1 2013-01-02 1 3 test foo
1 1 2013-01-02 1 3 train foo
2 1 2013-01-02 1 3 test foo
3 1 2013-01-02 1 3 train foo

In [169]:
df2.E.str.lower()


Out[169]:
0     test
1    train
2     test
3    train
Name: E, dtype: object

Separating and Merging


In [170]:
df = pd.DataFrame(np.random.randn(10, 4))
df


Out[170]:
0 1 2 3
0 -0.780990 -0.681411 -0.761773 -0.119461
1 0.099370 1.817058 -1.432180 -1.668737
2 -0.081338 -1.200262 0.301148 0.226868
3 0.552045 0.211555 1.113250 -0.135035
4 0.378446 -1.134929 -0.182837 -0.535175
5 -1.292433 -1.283620 0.882145 -0.822631
6 1.282259 1.795561 0.417589 -0.957859
7 0.115630 0.800898 -2.971596 -1.173003
8 0.813187 -0.924544 -1.515630 -1.392925
9 -1.209643 -0.061122 0.317282 0.508997

In [172]:
# Split the frame into three row chunks (0-2, 3-6, 7-9); plain [] slicing
# with integers is positional on this default RangeIndex.
pieces = [df[:3], df[3:7], df[7:]]
pieces


Out[172]:
[          0         1         2         3
 0 -0.780990 -0.681411 -0.761773 -0.119461
 1  0.099370  1.817058 -1.432180 -1.668737
 2 -0.081338 -1.200262  0.301148  0.226868,
           0         1         2         3
 3  0.552045  0.211555  1.113250 -0.135035
 4  0.378446 -1.134929 -0.182837 -0.535175
 5 -1.292433 -1.283620  0.882145 -0.822631
 6  1.282259  1.795561  0.417589 -0.957859,
           0         1         2         3
 7  0.115630  0.800898 -2.971596 -1.173003
 8  0.813187 -0.924544 -1.515630 -1.392925
 9 -1.209643 -0.061122  0.317282  0.508997]

In [173]:
pd.concat(pieces)


Out[173]:
0 1 2 3
0 -0.780990 -0.681411 -0.761773 -0.119461
1 0.099370 1.817058 -1.432180 -1.668737
2 -0.081338 -1.200262 0.301148 0.226868
3 0.552045 0.211555 1.113250 -0.135035
4 0.378446 -1.134929 -0.182837 -0.535175
5 -1.292433 -1.283620 0.882145 -0.822631
6 1.282259 1.795561 0.417589 -0.957859
7 0.115630 0.800898 -2.971596 -1.173003
8 0.813187 -0.924544 -1.515630 -1.392925
9 -1.209643 -0.061122 0.317282 0.508997

SQL-style


In [175]:
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
left


Out[175]:
key lval
0 foo 1
1 foo 2

In [176]:
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
right


Out[176]:
key rval
0 foo 4
1 foo 5

In [177]:
pd.merge(left, right, on='key')


Out[177]:
key lval rval
0 foo 1 4
1 foo 1 5
2 foo 2 4
3 foo 2 5

In [178]:
pd.merge(right, left, on='key')


Out[178]:
key rval lval
0 foo 4 1
1 foo 4 2
2 foo 5 1
3 foo 5 2

Appending


In [179]:
df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
df


Out[179]:
A B C D
0 -0.908767 -0.050439 -0.835676 -0.373857
1 0.709716 0.273589 -1.559243 2.308157
2 0.600050 -0.707956 1.365856 1.678989
3 -0.537913 0.886842 -0.045236 -0.659609
4 -0.943781 1.252410 -1.286733 0.322001
5 0.886099 0.184100 0.786995 0.157009
6 0.207511 1.646812 2.117746 0.190764
7 1.104337 0.520280 1.177717 1.581926

In [180]:
s = df.iloc[3]
s


Out[180]:
A   -0.537913
B    0.886842
C   -0.045236
D   -0.659609
Name: 3, dtype: float64

In [182]:
df.append(s)


Out[182]:
A B C D
0 -0.908767 -0.050439 -0.835676 -0.373857
1 0.709716 0.273589 -1.559243 2.308157
2 0.600050 -0.707956 1.365856 1.678989
3 -0.537913 0.886842 -0.045236 -0.659609
4 -0.943781 1.252410 -1.286733 0.322001
5 0.886099 0.184100 0.786995 0.157009
6 0.207511 1.646812 2.117746 0.190764
7 1.104337 0.520280 1.177717 1.581926
3 -0.537913 0.886842 -0.045236 -0.659609

In [181]:
df.append(s, ignore_index=True)


Out[181]:
A B C D
0 -0.908767 -0.050439 -0.835676 -0.373857
1 0.709716 0.273589 -1.559243 2.308157
2 0.600050 -0.707956 1.365856 1.678989
3 -0.537913 0.886842 -0.045236 -0.659609
4 -0.943781 1.252410 -1.286733 0.322001
5 0.886099 0.184100 0.786995 0.157009
6 0.207511 1.646812 2.117746 0.190764
7 1.104337 0.520280 1.177717 1.581926
8 -0.537913 0.886842 -0.045236 -0.659609

Grouping

Splitting the data into groups, Applying a function to each group, Combining the results into a data structure

In the apply step, we might wish to do one of the following:

    Aggregation: computing a summary statistic (or statistics) about each group. Some examples:

            Compute group sums or means
            Compute group sizes / counts

    Transformation: perform some group-specific computations and return a like-indexed object. Some examples:

            Standardizing data (zscore) within group
            Filling NAs within groups with a value derived from each group

    Filtration: discard some groups, according to a group-wise computation that evaluates True or False. Some examples:

            Discarding data that belongs to groups with only a few members
            Filtering out data based on the group sum or mean

    Some combination of the above: GroupBy will examine the results of the apply step and try to return a sensibly combined result if it doesn’t fit into any of the above categories

In [189]:
# BUG FIX: the original called numpy.random.seed, but numpy is only
# imported under the alias `np` (cell In [1]), so this raised NameError.
np.random.seed(3141)
# Two key columns (A, B) plus two numeric columns to aggregate.
df = pd.DataFrame({
   'A' : ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
   'B' : ['one', 'one', 'two', 'three',
          'two', 'two', 'one', 'three'],
   'C' : np.random.randn(8),
   'D' : np.random.randn(8)}
                  )
df


Out[189]:
A B C D
0 foo one 0.227769 -0.143393
1 bar one -0.755529 0.871788
2 foo two 1.144946 0.332175
3 bar three -0.352005 0.591673
4 foo two -0.482710 -2.419224
5 bar two 0.655263 0.254834
6 foo one 0.632421 -1.100392
7 foo three -0.622162 -0.307889

Basic grouping


In [206]:
print df.groupby('A').count()
print df.groupby('B').count()
print df.groupby('C').count()
print df.groupby('D').count()
print df.groupby(['A', 'B']).count()


     B  C  D
A           
bar  3  3  3
foo  5  5  5
       A  C  D
B             
one    3  3  3
three  2  2  2
two    3  3  3
           A  B  D
C                 
-0.755529  1  1  1
-0.622162  1  1  1
-0.482710  1  1  1
-0.352005  1  1  1
 0.227769  1  1  1
 0.632421  1  1  1
 0.655263  1  1  1
 1.144946  1  1  1
           A  B  C
D                 
-2.419224  1  1  1
-1.100392  1  1  1
-0.307889  1  1  1
-0.143393  1  1  1
 0.254834  1  1  1
 0.332175  1  1  1
 0.591673  1  1  1
 0.871788  1  1  1
           C  D
A   B          
bar one    1  1
    three  1  1
    two    1  1
foo one    2  2
    three  1  1
    two    2  2

Split by the columns


In [217]:
def get_letter_type(letter):
    """Classify a single letter as 'vowel' or 'consonant' (case-insensitive)."""
    return 'vowel' if letter.lower() in 'aeiou' else 'consonant'
df.groupby(get_letter_type, axis=1).sum()


Out[217]:
consonant vowel
0 0.084376 foo
1 0.116259 bar
2 1.477121 foo
3 0.239669 bar
4 -2.901935 foo
5 0.910097 bar
6 -0.467971 foo
7 -0.930051 foo

Groups


In [218]:
df.groupby('A').groups


Out[218]:
{'bar': [1, 3, 5], 'foo': [0, 2, 4, 6, 7]}

In [219]:
df.groupby(get_letter_type, axis=1).groups


Out[219]:
{'consonant': ['B', 'C', 'D'], 'vowel': ['A']}

In [220]:
df.groupby(['A', 'B']).groups


Out[220]:
{('bar', 'one'): [1],
 ('bar', 'three'): [3],
 ('bar', 'two'): [5],
 ('foo', 'one'): [0, 6],
 ('foo', 'three'): [7],
 ('foo', 'two'): [2, 4]}

In [221]:
len(df.groupby(['A', 'B']).groups)


Out[221]:
6

In [239]:
def printgroups(groups):
    """Print each group key followed by its sub-frame."""
    for key, frame in groups:
        print(key)
        print(frame)


def printgroups2(groups):
    """Like printgroups, but unpack two-level keys and print each part."""
    for (first, second), frame in groups:
        print(first)
        print(second)
        print(frame)

In [241]:
# print() with a single argument behaves identically in Python 2 and 3.
print(df)
# One-level key -> scalar group name; two-level key -> tuple name,
# which printgroups2 unpacks into its parts.
printgroups(df.groupby(['A']))
printgroups(df.groupby(['B','A']))
printgroups2(df.groupby(['B','A']))


     A      B         C         D
0  foo    one  0.227769 -0.143393
1  bar    one -0.755529  0.871788
2  foo    two  1.144946  0.332175
3  bar  three -0.352005  0.591673
4  foo    two -0.482710 -2.419224
5  bar    two  0.655263  0.254834
6  foo    one  0.632421 -1.100392
7  foo  three -0.622162 -0.307889
bar
     A      B         C         D
1  bar    one -0.755529  0.871788
3  bar  three -0.352005  0.591673
5  bar    two  0.655263  0.254834
foo
     A      B         C         D
0  foo    one  0.227769 -0.143393
2  foo    two  1.144946  0.332175
4  foo    two -0.482710 -2.419224
6  foo    one  0.632421 -1.100392
7  foo  three -0.622162 -0.307889
('one', 'bar')
     A    B         C         D
1  bar  one -0.755529  0.871788
('one', 'foo')
     A    B         C         D
0  foo  one  0.227769 -0.143393
6  foo  one  0.632421 -1.100392
('three', 'bar')
     A      B         C         D
3  bar  three -0.352005  0.591673
('three', 'foo')
     A      B         C         D
7  foo  three -0.622162 -0.307889
('two', 'bar')
     A    B         C         D
5  bar  two  0.655263  0.254834
('two', 'foo')
     A    B         C         D
2  foo  two  1.144946  0.332175
4  foo  two -0.482710 -2.419224
one
bar
     A    B         C         D
1  bar  one -0.755529  0.871788
one
foo
     A    B         C         D
0  foo  one  0.227769 -0.143393
6  foo  one  0.632421 -1.100392
three
bar
     A      B         C         D
3  bar  three -0.352005  0.591673
three
foo
     A      B         C         D
7  foo  three -0.622162 -0.307889
two
bar
     A    B         C         D
5  bar  two  0.655263  0.254834
two
foo
     A    B         C         D
2  foo  two  1.144946  0.332175
4  foo  two -0.482710 -2.419224

Group selection


In [243]:
grouped = df.groupby('A')
grouped.get_group('bar')


Out[243]:
A B C D
1 bar one -0.755529 0.871788
3 bar three -0.352005 0.591673
5 bar two 0.655263 0.254834

In [244]:
df.groupby(['A', 'B']).get_group(('bar', 'one'))


Out[244]:
A B C D
1 bar one -0.755529 0.871788

Aggregation


In [249]:
df


Out[249]:
A B C D
0 foo one 0.227769 -0.143393
1 bar one -0.755529 0.871788
2 foo two 1.144946 0.332175
3 bar three -0.352005 0.591673
4 foo two -0.482710 -2.419224
5 bar two 0.655263 0.254834
6 foo one 0.632421 -1.100392
7 foo three -0.622162 -0.307889

In [250]:
df.groupby('A').sum()


Out[250]:
C D
A
bar -0.452270 1.718295
foo 0.900263 -3.638724

In [251]:
df.groupby(['A','B']).sum()


Out[251]:
C D
A B
bar one -0.755529 0.871788
three -0.352005 0.591673
two 0.655263 0.254834
foo one 0.860190 -1.243786
three -0.622162 -0.307889
two 0.662236 -2.087049

In [252]:
df.groupby(['B','A']).sum()


Out[252]:
C D
B A
one bar -0.755529 0.871788
foo 0.860190 -1.243786
three bar -0.352005 0.591673
foo -0.622162 -0.307889
two bar 0.655263 0.254834
foo 0.662236 -2.087049

In [253]:
grouped = df.groupby('A')
grouped.aggregate(np.sum)


Out[253]:
C D
A
bar -0.452270 1.718295
foo 0.900263 -3.638724

In [254]:
grouped = df.groupby(['A', 'B'], as_index=False)
grouped.aggregate(np.sum)


Out[254]:
A B C D
0 bar one -0.755529 0.871788
1 bar three -0.352005 0.591673
2 bar two 0.655263 0.254834
3 foo one 0.860190 -1.243786
4 foo three -0.622162 -0.307889
5 foo two 0.662236 -2.087049

In [256]:
df.groupby(['A', 'B']).sum().reset_index()


Out[256]:
A B C D
0 bar one -0.755529 0.871788
1 bar three -0.352005 0.591673
2 bar two 0.655263 0.254834
3 foo one 0.860190 -1.243786
4 foo three -0.622162 -0.307889
5 foo two 0.662236 -2.087049

In [255]:
df.groupby('A', as_index=False).sum()


Out[255]:
A C D
0 bar -0.452270 1.718295
1 foo 0.900263 -3.638724

In [257]:
df.groupby(['A', 'B']).size()


Out[257]:
A    B    
bar  one      1
     three    1
     two      1
foo  one      2
     three    1
     two      2
dtype: int64

In [258]:
df.groupby(['A', 'B']).describe()


Out[258]:
C D
A B
bar one count 1.000000 1.000000
mean -0.755529 0.871788
std NaN NaN
min -0.755529 0.871788
25% -0.755529 0.871788
50% -0.755529 0.871788
75% -0.755529 0.871788
max -0.755529 0.871788
three count 1.000000 1.000000
mean -0.352005 0.591673
std NaN NaN
min -0.352005 0.591673
25% -0.352005 0.591673
50% -0.352005 0.591673
75% -0.352005 0.591673
max -0.352005 0.591673
two count 1.000000 1.000000
mean 0.655263 0.254834
std NaN NaN
min 0.655263 0.254834
25% 0.655263 0.254834
50% 0.655263 0.254834
75% 0.655263 0.254834
max 0.655263 0.254834
foo one count 2.000000 2.000000
mean 0.430095 -0.621893
std 0.286132 0.676700
min 0.227769 -1.100392
25% 0.328932 -0.861142
50% 0.430095 -0.621893
75% 0.531258 -0.382643
max 0.632421 -0.143393
three count 1.000000 1.000000
mean -0.622162 -0.307889
std NaN NaN
min -0.622162 -0.307889
25% -0.622162 -0.307889
50% -0.622162 -0.307889
75% -0.622162 -0.307889
max -0.622162 -0.307889
two count 2.000000 2.000000
mean 0.331118 -1.043525
std 1.150927 1.945533
min -0.482710 -2.419224
25% -0.075796 -1.731374
50% 0.331118 -1.043525
75% 0.738032 -0.355675
max 1.144946 0.332175

In [270]:
grouped = df.groupby('A')
grouped.groups


Out[270]:
{'bar': [1, 3, 5], 'foo': [0, 2, 4, 6, 7]}

In [271]:
grouped.agg([np.sum, np.mean, np.std]).reset_index()


Out[271]:
A C D
sum mean std sum mean std
0 bar -0.452270 -0.150757 0.726608 1.718295 0.572765 0.308912
1 foo 0.900263 0.180053 0.745107 -3.638724 -0.727745 1.077166

In [272]:
grouped['C'].agg([np.sum, np.mean, np.std]).reset_index()


Out[272]:
A sum mean std
0 bar -0.452270 -0.150757 0.726608
1 foo 0.900263 0.180053 0.745107

In [275]:
grouped['D'].agg({'Sum' : np.sum, 'Mean' : np.mean}).reset_index()


Out[275]:
A Sum Mean
0 bar 1.718295 0.572765
1 foo -3.638724 -0.727745

In [277]:
grouped.agg([np.sum, np.mean, np.std]).reset_index()


Out[277]:
A C D
sum mean std sum mean std
0 bar -0.452270 -0.150757 0.726608 1.718295 0.572765 0.308912
1 foo 0.900263 0.180053 0.745107 -3.638724 -0.727745 1.077166

In [281]:
grouped.agg({'C' : np.sum, 'D' : lambda x: np.std(x, ddof=1)}).reset_index()


Out[281]:
A C D
0 bar -0.452270 0.308912
1 foo 0.900263 1.077166

In [282]:
grouped.agg({'C' : 'sum', 'D' : 'std'}).reset_index()


Out[282]:
A C D
0 bar -0.452270 0.308912
1 foo 0.900263 1.077166

In [283]:
df.groupby('A').sum().reset_index()


Out[283]:
A C D
0 bar -0.452270 1.718295
1 foo 0.900263 -3.638724

Transformation


In [286]:
# 1100 daily points (~3 years) of N(0.5, 2) noise.
index = pd.date_range('10/1/1999', periods=1100)
ts = pd.Series(np.random.normal(0.5, 2, 1100), index)
# print() with a single argument behaves identically in Python 2 and 3.
print(ts.head())
# NOTE(review): pd.rolling_mean was removed in pandas 0.18+; the modern
# spelling is ts.rolling(window=100, min_periods=100).mean().
ts = pd.rolling_mean(ts, 100, 100).dropna()
print(ts.head())


1999-10-01   -0.897038
1999-10-02   -3.218710
1999-10-03   -2.404339
1999-10-04   -1.599611
1999-10-05   -1.453354
Freq: D, dtype: float64
2000-01-08    0.204654
2000-01-09    0.222684
2000-01-10    0.240081
2000-01-11    0.279773
2000-01-12    0.320368
Freq: D, dtype: float64

In [290]:
ts.tail()


Out[290]:
2002-09-30    0.641651
2002-10-01    0.606286
2002-10-02    0.637348
2002-10-03    0.623423
2002-10-04    0.611509
Freq: D, dtype: float64

In [295]:
# Group the daily series by calendar year and standardize within each year.
def key(x):
    """Grouping key: the calendar year of a timestamp."""
    return x.year

def zscore(x):
    """Standardize a group to zero mean and unit (sample) std."""
    return (x - x.mean()) / x.std()

transformed = ts.groupby(key).transform(zscore)

In [296]:
grouped = ts.groupby(key)
grouped.mean()


Out[296]:
2000    0.430660
2001    0.658088
2002    0.525698
dtype: float64

In [297]:
grouped.std()


Out[297]:
2000    0.088895
2001    0.098300
2002    0.157117
dtype: float64

In [298]:
grouped_trans = transformed.groupby(key)

In [299]:
grouped_trans.mean()


Out[299]:
2000    1.124449e-15
2001    6.428267e-15
2002   -3.755520e-16
dtype: float64

In [300]:
grouped_trans.std()


Out[300]:
2000    1
2001    1
2002    1
dtype: float64

In [301]:
compare = pd.DataFrame({'Original': ts, 'Transformed': transformed})

In [302]:
compare.plot()


Out[302]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9f9acacb90>

In [ ]: